#-*- coding:utf-8 -*- 
 
from whoosh.index import create_in  
from whoosh.fields import *  
from chinesetokenizer import ChineseAnalyzer
#from whoosh.analysis import RegexAnalyzer  
#analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)")

# Build a Whoosh index in the "index" directory from the tab-separated file
# "app_info.p".  Each input line is expected to hold at least two
# tab-separated columns: the application id and its text content.
analyzer = ChineseAnalyzer()

# Format passed as the ``vector`` option of the content field.
vector_format = formats.Frequency()

schema = Schema(
    appid=ID(stored=True),
    content=TEXT(stored=True, analyzer=analyzer, vector=vector_format),
)
ix = create_in("index", schema)

writer = ix.writer()
try:
    # The ``with`` block guarantees the input file is closed even if a line
    # fails to parse or decode.
    with open('app_info.p') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no record.
            if not line.strip():
                continue

            # Drop the line terminator so it does not end up in the stored
            # and indexed content when the content is the last column.
            fields = line.rstrip('\r\n').split('\t')

            writer.add_document(
                appid=fields[0].decode('utf-8'),
                content=fields[1].decode('utf-8'),
            )
except BaseException:
    # Release the index write lock and discard the partial segment instead
    # of leaving the writer open; the original error is re-raised unchanged.
    writer.cancel()
    raise
else:
    writer.commit()

